/*******************************************************************************

  Paying Outsourced Labor: Direct Evidence from Linked Temp Agency-Worker-Client Data

  By Andres Drenik, Simon Jäger, Pascuel Plotkin and Benjamin Schoefer
	January 7th, 2021

	DESCRIPTION: Creates dataset for AKM estimation and analysis and runs the akm model.

*******************************************************************************/




/********************************************************************************
***** Preliminaries
********************************************************************************/
* Session setup: turn off output paging and close any log still open from an earlier run
set more off
capture log close

* Open this run's log; the file name is suffixed with the current date string
local curr_date = c(current_date)
log using "${logs}/09_AKM_Preparation`curr_date'", replace


/********************************************************************************
***** Load Clean Dataset
********************************************************************************/

use "${intermediate_data_emp}/full_Dataset_Clean.dta", clear


/****************************************************************************************
* Attach the flag that separates workers into 2 random groups (defined in an earlier step)
****************************************************************************************/

	* Many rows per worker ID (cuil_trab) in memory, one row per worker in the group file.
	* Only matched observations are kept and no _merge variable is created.
	merge m:1 cuil_trab using "${intermediate_data_emp}/workers_group.dta", keep(match) nogenerate

/****************************************************************************************

* Additional Variables for Specification of Equation (2)
	* Differentiate when a firm is acting as User Firm from when it is acting as a
	  Regular Firm by adding an extra digit to the Firm ID
	* Generate a new variable of Firm ID to identify Temp Agencies in a temp worker spell
	* Generate quadratic and cubic variables for Age
	* Generate Variables for weighting

****************************************************************************************/

	* Build cuit_reg3, the firm identifier used for the firm fixed effect.
	* Regular spells (temp_worker == 0) take the employer ID. Temp spells (temp_worker == 1) take the
	* user-firm ID multiplied by 10, so a firm gets a different ID when acting as User Firm than when
	* acting as Regular Firm. The original ID is recovered later by dividing by 10.
	* Firm ID (CUIT) is standardized and has 11 digits: ## - XXXXXXXXX, where ## = 30, 33 or 34
	gen double cuit_reg3 = cuit_empl if temp_worker == 0
	replace cuit_reg3 = cuit_user_firm if temp_worker == 1
	replace cuit_reg3 = cuit_reg3*10 if temp_worker == 1

	* Identifier for the extra temp-agency fixed effect: the employer ID on temp spells, and a
	* single placeholder value (999999999998) on all regular spells
	gen double temp_agency_id = cuit_empl if temp_worker == 1
	replace temp_agency_id = 999999999998 if temp_worker == 0

	* Quadratic and cubic terms in age (variable names keep their original spelling because
	* later steps refer to them)
	gen age_cuadratic = age^2
	gen age_cubic = age^3

	* Counters used for weighting; they are summed in the collapse below.
	* NOTE(review): modalidad 12 is counted as "eventual" and 10/27/51 as "intern" -- the meaning of
	* these codes is not documented in this file; confirm against the modalidad codebook.
	gen obs = 1
	gen obs_eventual = (temp_worker == 0 & modalidad == 12)
	* The variable is always referred to by its full name (obs_intern). The previous code used the
	* abbreviation obs_inter, which only resolves while variable-name abbreviation is enabled.
	gen obs_intern = (temp_worker == 0 & inlist(modalidad, 10, 27, 51))
	gen obs_temp = (temp_worker == 1)


/****************************************************************************************
* Collapse to generate a Dataset with 1 observation per spell-year
****************************************************************************************/
	* One row per worker x firm ID x year: averages of wages, covariates and identifiers,
	* and totals of the observation counters
	gcollapse (mean) log_real_wage age age_cuadratic age_cubic temp_agency_id temp_worker real_firm cuit_empl group ciiu_4_user gender modalidad (sum) obs obs_eventual obs_intern obs_temp, by(cuil_trab cuit_reg3 year)

	compress

/****************************************************************************************

* Estimate the AKM specification estimating separate FE for firm when acting
as User Firms and when acting Regular Firms (Temp agency FE not included)
	* Run a Regression with the whole sample
	* Run a Regression with the sample split in 2 random groups of workers
	* Run a Regression with the sample split in 2 periods (2009 - 2013 / 2014 - 2017)

****************************************************************************************/

	* Whole sample: firm (cuit_reg3), worker (cuil_trab) and year effects are absorbed; the firm and
	* worker effects are saved as firm_fe3 / worker_fe3, and groupvar() stores the group identifier
	* reported by reghdfe in connected_set. Rows are frequency-weighted by obs.
	reghdfe log_real_wage age age_cuadratic age_cubic [fw=obs], absorb(firm_fe3=cuit_reg3 worker_fe3=cuil_trab year) groupvar(connected_set) compact poolsize(10)

	* Copy of the connected-set identifier that is filled only on temp-worker (user firm) rows
	gen connected_set_uf = connected_set if temp_worker == 1

	* Period indicator for the split-period analysis: 0 for years up to 2013, 1 for later years
	gen date_group = (year>2013)

	* Same specification estimated separately within each random worker group
	* (this will be used for IV to check for measurement error).
	* Saved effects: firm_fe3_g0 / worker_fe3_g0 and firm_fe3_g1 / worker_fe3_g1
	forvalues g = 0/1 {
		reghdfe log_real_wage age age_cuadratic age_cubic [fw=obs] if group == `g', absorb(firm_fe3_g`g'=cuit_reg3 worker_fe3_g`g'=cuil_trab year)
	}

	* Same specification estimated separately by period (0: 2009 - 2013, 1: 2014 - 2017).
	* Saved effects: firm_fe3_gd0 / worker_fe3_gd0 and firm_fe3_gd1 / worker_fe3_gd1
	forvalues p = 0/1 {
		reghdfe log_real_wage age age_cuadratic age_cubic [fw=obs] if date_group == `p', absorb(firm_fe3_gd`p'=cuit_reg3 worker_fe3_gd`p'=cuil_trab year)
	}

/****************************************************************************************
* Prepare Dataset for Regression specification that includes Temp Agency FE
	* Identify firms that can potentially generate collinearity and discard them
****************************************************************************************/

	* Attach the per-temp-agency information stored in multicolinearity_check_akm4.dta.
	* All observations from both sides are kept and no _merge variable is created.
	* NOTE(review): the variable total used below is presumably supplied by this file (number of
	* user firms that relate ONLY to that temp agency) -- confirm against the script that builds it.
	merge m:1 temp_agency_id using "${intermediate_data_emp}/multicolinearity_check_akm4.dta", nogenerate

	* Number of distinct user firms (real_firm different from the agency itself) per temp agency.
	* The per-agency count written by unique is summed within agency so that every row of the
	* agency carries it in unique_1; the helper variable is then removed.
	unique real_firm if real_firm != temp_agency_id, by(temp_agency_id) gen(unique1)
	gegen unique_1 = sum(unique1), by(temp_agency_id)
	drop unique1

	* Flag rows to discard from the temp-agency FE models when either
	*   (a) the agency's user-firm count equals total, or
	*   (b) the agency relates to exactly 1 user firm
	gen collinear_obs = (unique_1 == total) | (unique_1 == 1)

/****************************************************************************************

* Estimate the AKM specification estimating separate FE for firm when acting
as User Firms and when acting Regular Firms (including Temp agency FE)
	* Run a Regression with the whole sample
	* Run a Regression with the sample split in 2 random groups of workers
	* Run a Regression with the sample split in 2 periods (2009 - 2013 / 2014 - 2017)

****************************************************************************************/

	* All models in this section add a temp-agency effect (temp_agency_id) to the firm, worker and
	* year effects, and exclude rows flagged as collinear (collinear_obs == 1).
	* temp_agency_id equals the placeholder 999999999998 on regular (non-temp) spells, so the
	* effect estimated for that value is not a temp-agency effect; it is set to missing after
	* every model, including the period-split ones, which previously kept the placeholder estimate.

	* Whole sample
	reghdfe log_real_wage age age_cuadratic age_cubic [fw=obs] if collinear_obs==0, absorb(firm_fe4=cuit_reg3 worker_fe4=cuil_trab temp_fe4=temp_agency_id year) compact poolsize(10)
	replace temp_fe4 = . if temp_agency_id == 999999999998

	* Split by random worker group (this will be used for IV to check for measurement error).
	* Saved effects carry the suffix _g0 / _g1
	forvalues g = 0/1 {
		reghdfe log_real_wage age age_cuadratic age_cubic [fw=obs] if collinear_obs==0 & group == `g', absorb(firm_fe4_g`g'=cuit_reg3 worker_fe4_g`g'=cuil_trab temp_fe4_g`g'=temp_agency_id year) compact poolsize(10)
		replace temp_fe4_g`g' = . if temp_agency_id == 999999999998
	}

	* Split by period (0: 2009 - 2013, 1: 2014 - 2017). Saved effects carry the suffix _gd0 / _gd1
	forvalues p = 0/1 {
		reghdfe log_real_wage age age_cuadratic age_cubic [fw=obs] if collinear_obs==0 & date_group == `p', absorb(firm_fe4_gd`p'=cuit_reg3 worker_fe4_gd`p'=cuil_trab temp_fe4_gd`p'=temp_agency_id year) compact poolsize(10)
		replace temp_fe4_gd`p' = . if temp_agency_id == 999999999998
	}

	****************
	******** Build auxiliary dataset holding the fixed effects of firms in their User Firm role

	* Flag user-firm rows. Their IDs were multiplied by 10 when cuit_reg3 was created, so dividing
	* by 100000000000 gives a value of at least 1 for them, while the 11-digit regular-firm IDs
	* (prefixes 30, 33 or 34) give a value below 1. This does NOT recover the original ID; that is
	* done by the division by 10 further down.
	gen aux_UF = cuit_reg3 / 100000000000

	preserve

	* Keep the firm ID, the flag and the firm effects from every model
	keep cuit_reg3 aux_UF ///
		firm_fe3 firm_fe3_g0 firm_fe3_g1 firm_fe3_gd0 firm_fe3_gd1 ///
		firm_fe4 firm_fe4_g0 firm_fe4_g1 firm_fe4_gd0 firm_fe4_gd1

	* Only user-firm rows survive (rows with a missing flag also pass this condition)
	keep if aux_UF >= 1

	* Back to the original firm ID
	replace cuit_reg3 = cuit_reg3 / 10

	* firm_fe* becomes uf_fe* (user-firm effects), keeping each suffix unchanged
	rename firm_fe* uf_fe*

	* One row per firm with the mean of each user-firm effect
	gcollapse (mean) uf_fe3 uf_fe3_g0 uf_fe3_g1 uf_fe3_gd0 uf_fe3_gd1 uf_fe4 uf_fe4_g0 uf_fe4_g1 uf_fe4_gd0 uf_fe4_gd1, by(cuit_reg3)

	save "${intermediate_data_emp}/uf_fe.dta", replace

	restore

	*************
	* In the main data, return user-firm IDs to their original scale
	replace cuit_reg3 = cuit_reg3/10 if temp_worker == 1

	* Attach the user-firm effects to every row of the same firm ID
	* (all observations are kept and no _merge variable is created)
	merge m:1 cuit_reg3 using "${intermediate_data_emp}/uf_fe.dta", nogenerate

	* Intermediate save
	save "${intermediate_data_emp}/dataset_AKM_estimation.dta", replace

/****************************************************************************************
* Prepare Post-Estimation Dataset For Analysis
****************************************************************************************/

	preserve

	* Collapse into one observation per firm: means of the estimated firm and user-firm effects,
	* industry code and connected-set identifiers, and totals of the observation counters
	gcollapse (mean) firm_fe3 firm_fe3_g0 firm_fe3_g1 firm_fe3_gd0 firm_fe3_gd1 ///
		firm_fe4 firm_fe4_g0 firm_fe4_g1 firm_fe4_gd0 firm_fe4_gd1 ///
		uf_fe3 uf_fe3_g0 uf_fe3_g1 uf_fe3_gd0 uf_fe3_gd1 ///
		uf_fe4 uf_fe4_g0 uf_fe4_g1 uf_fe4_gd0 uf_fe4_gd1 ///
		ciiu_4_user connected_set connected_set_uf ///
		(sum) obs obs_temp, by(cuit_reg3)

	* Set the connected-set variables to missing where they equal 0
	* NOTE(review): the original comment describes these as observations outside the largest
	* connected set -- confirm that 0 is how such observations are coded.
	replace connected_set = . if connected_set == 0
	replace connected_set_uf = . if connected_set_uf == 0

	* Merge dataset that contains regular worker tenure by firm.
	* The path uses a forward slash, which Stata accepts on every platform; the previous backslash
	* separator only worked on Windows.
	* NOTE(review): this file is read from the pathinit_intermediate global, not from
	* intermediate_data_emp like the other inputs -- confirm that this is intended.
	merge 1:1 cuit_reg3 using "${pathinit_intermediate}/reg_tenure.dta", nogen

	* Generate scrambled IDs for firms for potential export: rows without a usable firm ID are
	* removed, firms are numbered consecutively, and the original ID is dropped
	drop if cuit_reg3 == 0 | cuit_reg3 == .
	gegen firm_id = group(cuit_reg3)
	drop cuit_reg3

	* Save Dataset
	save "${intermediate_data_emp}/AKM_Firm_FE_for_export_stata.dta", replace

	restore

log close
